library(here)
## here() starts at /home/peter/SCORE/C075_Grant_Coultas
sce <- readRDS(here("data/SCEs/C075_Grant_Coultas.scPipe.SCE.rds"))
library(scater)
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
as.data.frame(colData(sce)) %>%
group_by(plate_number, cell_type_descriptor, sample_name) %>%
count() %>%
knitr::kable()
| LC279 |
Control sample # 1577 |
20 cells |
3 |
| LC279 |
Control sample # 1577 |
single cell |
185 |
| LC279 |
Mutant sample # 1576 |
20 cells |
3 |
| LC279 |
Mutant sample # 1576 |
single cell |
185 |
| LC280 |
Control sample # 1577 |
20 cells |
3 |
| LC280 |
Control sample # 1577 |
single cell |
185 |
| LC280 |
Mutant sample # 1576 |
20 cells |
3 |
| LC280 |
Mutant sample # 1576 |
single cell |
185 |
| LC294 |
Control sample # 1611 |
20 cells |
3 |
| LC294 |
Control sample # 1611 |
single cell |
185 |
| LC294 |
Mutant sample # 1609 |
20 cells |
3 |
| LC294 |
Mutant sample # 1609 |
single cell |
185 |
| LC358 |
Control sample # 1696 |
20 cells |
3 |
| LC358 |
Control sample # 1696 |
single cell |
184 |
| LC358 |
Mutant sample # 1695 |
20 cells |
3 |
| LC358 |
Mutant sample # 1695 |
single cell |
182 |
| LC392 |
Control sample # 1743 |
20 cells |
3 |
| LC392 |
Control sample # 1743 |
single cell |
185 |
| LC392 |
Mutant sample # 1747 |
20 cells |
3 |
| LC392 |
Mutant sample # 1747 |
single cell |
185 |
| LC396 |
Control sample # 1824 |
20 cells |
2 |
| LC396 |
Control sample # 1824 |
single cell |
112 |
| LC396 |
Mutant sample # 1819 |
20 cells |
3 |
| LC396 |
Mutant sample # 1819 |
single cell |
185 |
| LC398 |
Control sample # 1824 |
20 cells |
2 |
| LC398 |
Control sample # 1824 |
single cell |
55 |
| LC398 |
Mutant sample # 1823 |
20 cells |
3 |
| LC398 |
Mutant sample # 1823 |
single cell |
185 |
library(EnsDb.Mmusculus.v79)
## Loading required package: ensembldb
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: AnnotationFilter
##
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
##
## filter
## The following object is masked from 'package:stats':
##
## filter
ensembl <- gsub("\\.[0-9]+$", "", rownames(sce))
symb <- mapIds(
x = EnsDb.Mmusculus.v79,
# NOTE: Need to remove gene version number prior to lookup.
keys = ensembl,
keytype = "GENEID",
column = "SYMBOL")
## Warning: Unable to map 6238 of 34297 requested IDs.
rowData(sce)$ENSEMBL <- ensembl
rowData(sce)$SYMBOL <- symb
# Replace the row names of the SCE by the gene symbols (where available).
library(scater)
rownames(sce) <- uniquifyFeatureNames(rownames(sce), rowData(sce)$SYMBOL)
# Add chromosome location so we can filter on mitochondrial genes.
location <- mapIds(
x = EnsDb.Mmusculus.v79,
# NOTE: Need to remove gene version number prior to lookup.
keys = rowData(sce)$ENSEMBL,
keytype = "GENEID",
column = "SEQNAME")
## Warning: Unable to map 6238 of 34297 requested IDs.
rowData(sce)$CHR <- location
is_mito <- rowData(sce)$CHR == "MT"
sce <- addPerCellQC(sce, subsets = list(Mt = which(is_mito)))
sce$sample <- paste0(
sce$plate_number,
".",
stringr::str_extract(sce$cell_type_descriptor, "# [0-9]+"))
sce$group <- ifelse(grepl("Mutant", sce$cell_type_descriptor), "mutant", "control")
plotColData(
sce,
y = "sum",
x = "sample",
colour_by = "group",
other_fields = "sample_name") +
scale_y_log10() +
facet_grid(~sample_name) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
sce,
y = "detected",
x = "sample",
colour_by = "group",
other_fields = "sample_name") +
scale_y_log10() +
facet_grid(~sample_name) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
sce,
y = "altexps_ERCC_percent",
x = "sample",
colour_by = "group",
other_fields = "sample_name") +
ylim(0, 100) +
facet_grid(~sample_name) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

plotColData(
sce,
y = "subsets_Mt_percent",
x = "sample",
colour_by = "group",
other_fields = "sample_name") +
ylim(0, 100) +
facet_grid(~sample_name) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

p <- lapply(sort(unique(sce$plate_number)), function(p) {
z <- sce[, sce$plate_number == p]
plotPlatePosition(
z,
as.character(z$well_position),
colour_by = "altexps_ERCC_percent",
point_size = 2,
point_alpha = 1,
theme_size = 5,
shape_by = "group",
by_show_single = TRUE) +
ggtitle(p) +
scale_colour_viridis_c(
limits = c(0, 60),
breaks = seq(0, 60, 10)) +
scale_shape_manual(values = c(control = 16, mutant = 17)) +
guides(shape = FALSE)
})
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
multiplot(plotlist = p, cols = 3)

# Check expression of mutant gene (Kat7)
plotExpression(
sce,
"Kat7",
x = "sample",
exprs_values = "counts",
colour_by = "group") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

table(sce$cell_type_descriptor, counts(sce)["Kat7", ] > 0)
##
## FALSE TRUE
## Control sample # 1577 342 34
## Control sample # 1611 157 31
## Control sample # 1696 156 31
## Control sample # 1743 154 34
## Control sample # 1824 143 28
## Mutant sample # 1576 354 22
## Mutant sample # 1609 178 10
## Mutant sample # 1695 165 20
## Mutant sample # 1747 170 18
## Mutant sample # 1819 182 6
## Mutant sample # 1823 182 6
sce <- sce[, sce$sample_name == "single cell"]
sce$sample <- factor(sce$sample)
sce$sample_name <- factor(sce$sample_name)
libsize_drop <- isOutlier(
metric = sce$sum,
nmads = 3,
type = "lower",
log = TRUE)
feature_drop <- isOutlier(
metric = sce$detected,
nmads = 3,
type = "lower",
log = TRUE)
spike_drop <- isOutlier(
metric = sce$altexps_ERCC_percent,
nmads = 5,
type = "higher")
sce_pre_QC_outlier_removal <- sce
sce <- sce[, !(libsize_drop | feature_drop | spike_drop)]
data.frame(
ByLibSize = tapply(
libsize_drop,
sce_pre_QC_outlier_removal$plate_number,
sum),
ByFeature = tapply(
feature_drop,
sce_pre_QC_outlier_removal$plate_number,
sum),
BySpike = tapply(
spike_drop,
sce_pre_QC_outlier_removal$plate_number,
sum),
Remaining = as.vector(unname(table(sce$plate_number)))) %>%
knitr::kable(
caption = "Number of samples removed by each QC step and the number of samples remaining.")
Number of samples removed by each QC step and the number of samples remaining.
| LC279 |
44 |
59 |
1 |
311 |
| LC280 |
3 |
6 |
0 |
364 |
| LC294 |
2 |
4 |
0 |
366 |
| LC358 |
1 |
2 |
0 |
364 |
| LC392 |
2 |
2 |
0 |
368 |
| LC396 |
8 |
10 |
3 |
286 |
| LC398 |
27 |
34 |
45 |
187 |
cbind(
"pre-QC" = table(sce_pre_QC_outlier_removal$cell_type_descriptor),
"post-QC" = table(sce$cell_type_descriptor))
## pre-QC post-QC
## Control sample # 1577 370 363
## Control sample # 1611 185 182
## Control sample # 1696 184 182
## Control sample # 1743 185 184
## Control sample # 1824 167 161
## Mutant sample # 1576 370 312
## Mutant sample # 1609 185 184
## Mutant sample # 1695 182 182
## Mutant sample # 1747 185 184
## Mutant sample # 1819 185 177
## Mutant sample # 1823 185 135
plotHighestExprs(sce, n = 50)

plotHighestExprs(sce[!grepl("^mt|^Rpl|^Rps", rownames(sce))], n = 50)

ave_counts <- calcAverage(sce, use_size_factors = FALSE)
## Warning: 'calcAverage' is deprecated.
## Use 'calculateAverage' instead.
## See help("Deprecated")
## Warning: 'use_size_factors=FALSE' is deprecated.
## See help("Deprecated")
par(mfrow = c(1, 1))
hist(
x = log10(ave_counts),
breaks = 100,
main = "",
col = "grey",
xlab = expression(Log[10] ~ "average count"))

# NOTE: I've opted to filter out genes with zero counts across all data sets
# rather than a per-data set basis. This means that each data set continues
# to have the same set of genes after this filter is applied.
to_keep <- ave_counts > 0
table(to_keep)
## to_keep
## FALSE TRUE
## 417 33880
sce <- sce[to_keep, ]
library(scran)
set.seed(1011220)
clusters <- quickCluster(
sce,
min.size = 60,
use.ranks = FALSE,
BSPARAM = BiocSingular::IrlbaParam())
table(clusters)
## clusters
## 1 2 3 4 5 6 7 8 9
## 462 354 441 338 91 142 154 175 89
sce <- computeSumFactors(
sce,
clusters = clusters,
min.mean = 0.1)
summary(sizeFactors(sce))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1618 0.6032 0.9084 1.0000 1.2747 5.1049
xlim <- c(1, max(sce$sum) / 1e3)
ylim <- range(sizeFactors(sce))
par(mfrow = c(3, 3))
lapply(sort(unique(sce$plate_number)), function(p) {
sce <- sce[, sce$plate_number == p]
plot(
x = sce$sum / 1e3,
y = sizeFactors(sce),
log = "xy",
xlab = "Library size (thousands)",
ylab = "Size factor",
main = p,
xlim = xlim,
ylim = ylim,
pch = 16)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
sce <- computeSpikeFactors(sce, spikes = "ERCC", general.use = FALSE)
sce <- logNormCounts(sce)
var.out <- modelGeneVarWithSpikes(sce, "ERCC")
par(mfrow = c(1, 1))

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
ylab="Variance of log-expression")
fit <- metadata(var.out)
points(fit$mean, fit$var, col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)

chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10]
plotExpression(sce, rownames(var.out)[chosen.genes])

set.seed(1000)
sce <- denoisePCA(sce, technical=fit$trend, BSPARAM=BiocSingular::IrlbaParam())
ncol(reducedDim(sce, "PCA"))
## [1] 5
set.seed(1000)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=50)
## Warning: 'use_dimred' is deprecated.
## Use 'dimred' instead.
## See help("Deprecated")
plotTSNE(sce, colour_by = "group")

plotTSNE(sce, colour_by = "cell_type_descriptor")

plotTSNE(sce, colour_by = "plate_number")

plotTSNE(sce, colour_by = "subsets_Mt_percent")

# TODO: Figure out if its MT, X, and/or Y that's causing the clustering.
# NOTE: Remove MT, X, and Y and re-do t-SNE.
sce <- sce[which(!rowData(sce)$CHR %in% c("MT", "X", "Y")), ]
# sce <- sce[which(!rowData(sce)$CHR %in% c("MT")), ]
set.seed(1011220)
clusters <- quickCluster(
sce,
min.size = 60,
use.ranks = FALSE,
BSPARAM = BiocSingular::IrlbaParam())
table(clusters)
## clusters
## 1 2 3 4 5 6 7 8 9 10 11
## 403 233 351 209 196 95 349 83 96 149 82
sce <- computeSumFactors(
sce,
clusters = clusters,
min.mean = 0.1)
summary(sizeFactors(sce))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1655 0.6099 0.9091 1.0000 1.2696 5.3658
xlim <- c(1, max(sce$sum) / 1e3)
ylim <- range(sizeFactors(sce))
par(mfrow = c(3, 3))
lapply(sort(unique(sce$plate_number)), function(p) {
sce <- sce[, sce$plate_number == p]
plot(
x = sce$sum / 1e3,
y = sizeFactors(sce),
log = "xy",
xlab = "Library size (thousands)",
ylab = "Size factor",
main = p,
xlim = xlim,
ylim = ylim,
pch = 16)
})
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
sce <- computeSpikeFactors(sce, spikes = "ERCC", general.use = FALSE)
sce <- logNormCounts(sce)
var.out <- modelGeneVarWithSpikes(sce, "ERCC")
par(mfrow = c(1, 1))

plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression",
ylab="Variance of log-expression")
fit <- metadata(var.out)
points(fit$mean, fit$var, col="red", pch=16)
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)

chosen.genes <- order(var.out$bio, decreasing=TRUE)[1:10]
plotExpression(sce, rownames(var.out)[chosen.genes])

set.seed(1000)
sce <- denoisePCA(sce, technical=fit$trend, BSPARAM=BiocSingular::IrlbaParam())
ncol(reducedDim(sce, "PCA"))
## [1] 5
set.seed(1000)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=50)
## Warning: 'use_dimred' is deprecated.
## Use 'dimred' instead.
## See help("Deprecated")
plotTSNE(sce, colour_by = "group")

plotTSNE(sce, colour_by = "cell_type_descriptor")

plotTSNE(sce, colour_by = "plate_number")

plotTSNE(sce, colour_by = "subsets_Mt_percent")

snn.gr <- buildSNNGraph(sce, use.dimred="PCA")
cluster.out <- igraph::cluster_walktrap(snn.gr)
my.clusters <- cluster.out$membership
table(my.clusters)
## my.clusters
## 1 2 3 4 5 6 7 8 9 10
## 152 283 193 415 401 392 94 151 135 30
sce$cluster <- factor(my.clusters)
plotTSNE(sce, colour_by="cluster")

library(SingleR)
mouse_se <- MouseRNAseqData()
## snapshotDate(): 2019-10-22
## see ?SingleR and browseVignettes('SingleR') for documentation
## loading from cache
## see ?SingleR and browseVignettes('SingleR') for documentation
## loading from cache
as.data.frame(colData(mouse_se)) %>%
tabyl(label.main) %>%
adorn_pct_formatting(1) %>%
knitr::kable(
caption = "Summary of main cell types in the `MouseRNAseqData` reference set.")
Summary of main cell types in the MouseRNAseqData reference set.
| Adipocytes |
13 |
3.6% |
| Astrocytes |
27 |
7.5% |
| B cells |
5 |
1.4% |
| Cardiomyocytes |
8 |
2.2% |
| Dendritic cells |
2 |
0.6% |
| Endothelial cells |
12 |
3.4% |
| Epithelial cells |
2 |
0.6% |
| Erythrocytes |
3 |
0.8% |
| Fibroblasts |
45 |
12.6% |
| Granulocytes |
15 |
4.2% |
| Hepatocytes |
4 |
1.1% |
| Macrophages |
32 |
8.9% |
| Microglia |
72 |
20.1% |
| Monocytes |
6 |
1.7% |
| Neurons |
64 |
17.9% |
| NK cells |
18 |
5.0% |
| Oligodendrocytes |
12 |
3.4% |
| T cells |
18 |
5.0% |
rmarkdown::paged_table(
as.data.frame(colData(mouse_se)) %>%
dplyr::count(label.main, label.fine) %>%
dplyr::arrange(label.main))
pred_mouse_cell_main <- SingleR(
test = sce,
ref = mouse_se,
labels = mouse_se$label.main,
BPPARAM = SerialParam())
tabyl(data.frame(label.main = pred_mouse_cell_main$labels), label.main) %>%
adorn_pct_formatting(digits = 1) %>%
dplyr::arrange(desc(n)) %>%
knitr::kable(
caption = "Cell label assignments using the main labels of the `MouseRNAseqData` reference data.")
Cell label assignments using the main labels of the MouseRNAseqData reference data.
| Endothelial cells |
2244 |
99.9% |
| B cells |
1 |
0.0% |
| Macrophages |
1 |
0.0% |
stopifnot(identical(rownames(pred_mouse_cell_main), colnames(sce)))
plotScoreHeatmap(
pred_mouse_cell_main,
annotation_col = data.frame(
cluster = sce$cluster,
sample = sce$cell_type_descriptor,
row.names = rownames(pred_mouse_cell_main)))

pred_mouse_cell_fine <- SingleR(
test = sce,
ref = mouse_se,
labels = mouse_se$label.fine,
BPPARAM = SerialParam())
tabyl(data.frame(label.fine = pred_mouse_cell_fine$labels), label.fine) %>%
adorn_pct_formatting(digits = 1) %>%
dplyr::arrange(desc(n)) %>%
knitr::kable(
caption = "Cell label assignments using the fine labels of the `MouseRNAseqData` reference data.")
Cell label assignments using the fine labels of the MouseRNAseqData reference data.
| Endothelial cells |
2244 |
99.9% |
| Hepatocytes |
2 |
0.1% |
stopifnot(identical(rownames(pred_mouse_cell_main), colnames(sce)))
plotScoreHeatmap(
pred_mouse_cell_fine,
annotation_col = data.frame(
cluster = sce$cluster,
sample = sce$cell_type_descriptor,
row.names = rownames(pred_mouse_cell_fine)))

genes_of_interest <- c(
"Col4a1", "Col4a2", "Epas1", "Cdh5", "Ptprb", "Pecam1", "Vwf", "Itgb1",
"Calcrl", "Plvap", "Tie1", "Cldn5", "Acvrl1", "Eng", "Kdr", "Kat7")
plotExpression(sce, genes_of_interest, x = "cell_type_descriptor", colour_by = "group")

data.frame(
gene = genes_of_interest,
median_expression = signif(rowMedians(as.matrix(logcounts(sce)[genes_of_interest, ])), 3)) %>%
dplyr::arrange(desc(median_expression)) %>%
knitr::kable()
| Col4a1 |
4.480 |
| Cldn5 |
4.410 |
| Col4a2 |
3.880 |
| Kdr |
3.290 |
| Eng |
2.710 |
| Itgb1 |
2.670 |
| Epas1 |
2.640 |
| Tie1 |
2.580 |
| Cdh5 |
2.520 |
| Ptprb |
2.510 |
| Pecam1 |
2.480 |
| Plvap |
1.010 |
| Vwf |
0.995 |
| Acvrl1 |
0.812 |
| Calcrl |
0.000 |
| Kat7 |
0.000 |
p <- lapply(genes_of_interest, function(g) {
plotTSNE(sce, colour_by = g) +
ggtitle(g)
})
multiplot(plotlist = p, cols = 4)
